
import jieba
'''
    This module provides helpers for preprocessing documents:
    word segmentation (via jieba) with optional stop-word filtering.
'''
def sep_words_by_path(file_path,
                      using_stopwords=True,
                      stop_words_path='C:/Users/Administrator/Desktop/using_python/stopwords.txt'):
    '''
    Read the document at file_path, segment it with jieba, and return the
    tokens joined into a single space-separated string.

    Each input line is expected to be tab-separated with the text body in
    the second field; lines without a tab are skipped. Files are decoded
    as gb18030 with undecodable bytes ignored.

    :param file_path: path of the document to segment, e.g. 'C:/exp.txt'.
    :param using_stopwords: when True, drop stop words, single-character
        tokens and pure-digit tokens from the result.
    :param stop_words_path: path of the stop-word list, one word per line.
    :return: a string of space-joined tokens (each processed line also
        contributes one trailing space).
    '''
    chunks = []
    if using_stopwords:
        # Load stop words into a set for O(1) membership tests.
        with open(stop_words_path, 'r', encoding='gb18030', errors='ignore') as stopwords_file:
            # rstrip('\n') instead of line[:-1]: the last line of a file may
            # have no trailing newline, and [:-1] would eat a real character.
            stop_words = {line.rstrip('\n') for line in stopwords_file}
        with open(file_path, 'r', encoding='gb18030', errors='ignore') as file:
            for line in file:
                body = line.rstrip('\n').split('\t')
                if len(body) < 2:
                    # Malformed line with no tab-separated body field: skip
                    # instead of raising IndexError.
                    continue
                kept = []
                for word in jieba.lcut(body[1]):
                    # Filter stop words, single characters and pure digits.
                    if word in stop_words or len(word) == 1 or word.isdigit():
                        continue
                    kept.append(word)
                chunks.append(' '.join(kept) + ' ')
    else:
        # BUG FIX: the original opened the file in 'rb' mode while also
        # passing encoding=..., which raises ValueError — binary mode does
        # not accept an encoding. Open in text mode like the branch above.
        with open(file_path, 'r', encoding='gb18030', errors='ignore') as file:
            for line in file:
                body = line.rstrip('\n').split('\t')
                if len(body) < 2:
                    continue
                chunks.append(' '.join(jieba.lcut(body[1])) + ' ')
    # Join once at the end instead of quadratic string += in the loop.
    return ''.join(chunks)

def sep_words_by_string(string, using_stopwords=True,
                      stop_words_path='C:/Users/Administrator/Desktop/using_python/stopwords.txt'):
    '''
    Segment a single tab-separated line with jieba and return the tokens
    joined into a space-separated string.

    The text body is taken from the second tab-separated field of the
    input; if the input has no tab, an empty result is returned.

    :param string: the line to segment (an optional trailing newline is
        stripped).
    :param using_stopwords: when True, drop stop words, single-character
        tokens and pure-digit tokens from the result.
    :param stop_words_path: path of the stop-word list, one word per line,
        decoded as utf-8.
    :return: a string of space-joined tokens.
    '''
    # endswith() instead of indexing [-1]: the original raised IndexError
    # on an empty input string.
    temp_string = string[:-1] if string.endswith('\n') else string
    fields = temp_string.split('\t')
    if len(fields) < 2:
        # No tab-separated body field: nothing to segment.
        return ''
    using_string = fields[1]
    if using_stopwords:
        # BUG FIX: the original opened the stop-word file in 'rb' mode while
        # also passing encoding=..., which raises ValueError (and would have
        # produced bytes that never match str tokens anyway). Open in text
        # mode, and collect into a set for O(1) membership tests.
        with open(stop_words_path, 'r', encoding='utf-8', errors='ignore') as stopwords_file:
            stop_words = {line.rstrip('\n') for line in stopwords_file}
        kept = []
        for word in jieba.lcut(using_string):
            # Filter stop words, single characters and pure digits.
            if word in stop_words or len(word) == 1 or word.isdigit():
                continue
            kept.append(word)
        return ' '.join(kept)
    # CONSISTENCY FIX: the original segmented the raw `string` here
    # (including the first tab field and trailing newline) instead of the
    # extracted body, unlike the branch above and sep_words_by_path.
    return ' '.join(jieba.lcut(using_string))
